options(width = 100)
#Load the libraries required
library(jsonlite)
library(lattice)
library(ggmap)
library(sp)
The data set is “Seattle Police Department 911 Incident Response.”
People have called 911 at different times in the city of Seattle for various incidents. Each incident is categorized by the column “initial_type_subgroup”, which is further broken down by the column “initial_type_description”.
The SPD has divided the city into areas given by the column “district_sector”, which are further subdivided into smaller zones indicated by the column “zone_beat”. Also, each incident has been given reference numbers, indicated by the columns “cad_event_number” and “cad_cdw_id”.
Based on where the crime was reported from, a latitude and longitude were collected, which we can use to get the approximate location where the incident/crime occurred. The assumption here is that the 911 call was made from the same location as, or a location close to, the scene of the crime. The variable “at_scene_time” gives us a time when the incident/crime was reported through 911 to the SPD.
# Download the SPD 911 incident records (JSON) from the Socrata open data
# endpoint and parse them into the data frame spd911
spd911_endpoint <- "https://data.seattle.gov/resource/3k2p-39jp.json"
spd911 <- jsonlite::fromJSON(spd911_endpoint)
# Preview the first six rows of the data frame
head(spd911, n = 6L)
## cad_event_number cad_cdw_id zone_beat initial_type_description
## 1 15000035997 581875 K3 AUTO THEFT - VEH THEFT OR THEFT AND RECOVERY
## 2 15000035929 581910 Q1 NARCOTICS - VIOLATIONS (LOITER, USE, SELL, NARS)
## 3 15000035487 582157 K3 FOOT - ELUDING POLICE
## 4 15000035390 582215 F2 AUTO RECOVERY
## 5 15000035285 582277 L1 AUTO RECOVERY
## 6 15000035205 582326 F1 AUTO RECOVERY
## district_sector initial_type_subgroup incident_location.needs_recoding
## 1 K AUTO THEFTS FALSE
## 2 Q NARCOTICS COMPLAINTS FALSE
## 3 K TRAFFIC RELATED CALLS FALSE
## 4 F AUTO THEFTS FALSE
## 5 L AUTO THEFTS FALSE
## 6 F AUTO THEFTS FALSE
## incident_location.longitude incident_location.latitude hundred_block_location
## 1 -122.330271593 47.600875809 3 AV S / S WASHINGTON ST
## 2 -122.37613941 47.636336049 20XX BLOCK OF 15 AV W
## 3 -122.326350868 47.601708802 6 AV / YESLER WY
## 4 -122.363172642 47.525585666 86XX BLOCK OF 24 AV SW
## 5 -122.304248161 47.727498035 135XX BLOCK OF 23 AV NE
## 6 -122.369833395 47.546493546 63XX BLOCK OF 29 AV SW
## general_offense_number longitude latitude at_scene_time initial_type_group
## 1 201535997 -122.330271593 47.600875809 2015-02-01T00:20:00 AUTO RECOVERIES
## 2 201535929 -122.376139410 47.636336049 2015-01-31T23:12:00 NARCOTICS COMPLAINTS
## 3 201535487 -122.326350868 47.601708802 2015-01-31T15:14:00 TRAFFIC RELATED CALLS
## 4 201535390 -122.363172642 47.525585666 2015-01-31T13:36:00 AUTO RECOVERIES
## 5 201535285 -122.304248161 47.727498035 2015-01-31T12:08:00 AUTO RECOVERIES
## 6 201535205 -122.369833395 47.546493546 2015-01-31T10:24:00 AUTO RECOVERIES
## census_tract event_clearance_code event_clearance_subgroup event_clearance_group
## 1 9200.2014 <NA> <NA> <NA>
## 2 5802.2003 <NA> <NA> <NA>
## 3 9200.1002 <NA> <NA> <NA>
## 4 11401.2005 <NA> <NA> <NA>
## 5 200.6017 <NA> <NA> <NA>
## 6 10700.4001 <NA> <NA> <NA>
## event_clearance_description
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
# List every column name in the data frame
names(spd911)
## [1] "cad_event_number" "cad_cdw_id" "zone_beat"
## [4] "initial_type_description" "district_sector" "initial_type_subgroup"
## [7] "incident_location" "hundred_block_location" "general_offense_number"
## [10] "longitude" "latitude" "at_scene_time"
## [13] "initial_type_group" "census_tract" "event_clearance_code"
## [16] "event_clearance_subgroup" "event_clearance_group" "event_clearance_description"
# Inspect the structure (column types and sample values) of the data frame
utils::str(spd911)
## 'data.frame': 1000 obs. of 18 variables:
## $ cad_event_number : chr "15000035997" "15000035929" "15000035487" "15000035390" ...
## $ cad_cdw_id : chr "581875" "581910" "582157" "582215" ...
## $ zone_beat : chr "K3" "Q1" "K3" "F2" ...
## $ initial_type_description : chr "AUTO THEFT - VEH THEFT OR THEFT AND RECOVERY" "NARCOTICS - VIOLATIONS (LOITER, USE, SELL, NARS)" "FOOT - ELUDING POLICE" "AUTO RECOVERY" ...
## $ district_sector : chr "K" "Q" "K" "F" ...
## $ initial_type_subgroup : chr "AUTO THEFTS" "NARCOTICS COMPLAINTS" "TRAFFIC RELATED CALLS" "AUTO THEFTS" ...
## $ incident_location :'data.frame': 1000 obs. of 3 variables:
## ..$ needs_recoding: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## ..$ longitude : chr "-122.330271593" "-122.37613941" "-122.326350868" "-122.363172642" ...
## ..$ latitude : chr "47.600875809" "47.636336049" "47.601708802" "47.525585666" ...
## $ hundred_block_location : chr "3 AV S / S WASHINGTON ST" "20XX BLOCK OF 15 AV W" "6 AV / YESLER WY" "86XX BLOCK OF 24 AV SW" ...
## $ general_offense_number : chr "201535997" "201535929" "201535487" "201535390" ...
## $ longitude : chr "-122.330271593" "-122.376139410" "-122.326350868" "-122.363172642" ...
## $ latitude : chr "47.600875809" "47.636336049" "47.601708802" "47.525585666" ...
## $ at_scene_time : chr "2015-02-01T00:20:00" "2015-01-31T23:12:00" "2015-01-31T15:14:00" "2015-01-31T13:36:00" ...
## $ initial_type_group : chr "AUTO RECOVERIES" "NARCOTICS COMPLAINTS" "TRAFFIC RELATED CALLS" "AUTO RECOVERIES" ...
## $ census_tract : chr "9200.2014" "5802.2003" "9200.1002" "11401.2005" ...
## $ event_clearance_code : chr NA NA NA NA ...
## $ event_clearance_subgroup : chr NA NA NA NA ...
## $ event_clearance_group : chr NA NA NA NA ...
## $ event_clearance_description: chr NA NA NA NA ...
Since our analysis does not use all the columns, we would like to get rid of the data that is not important for it. This does not mean that the data is of no use; the removed data could be used in another analysis.
We first remove all such columns, and then we perform further data cleaning by casting certain columns to appropriate data types, which makes the analysis easier.
# Drop the columns this analysis does not use: the four event_clearance_*
# columns and the nested incident_location data frame (its longitude and
# latitude are also available as top-level columns).
unused_columns <- c(
  "event_clearance_code",
  "event_clearance_group",
  "event_clearance_subgroup",
  "event_clearance_description",
  "incident_location"
)
for (column in unused_columns) {
  spd911[[column]] <- NULL
}

# Every remaining column arrives from the JSON feed as character, so cast the
# ones we compute on to appropriate types.
# cad_event_number is deliberately left as character: values such as
# "15000035997" exceed the range of R's 32-bit integers.
spd911$cad_cdw_id <- as.integer(spd911$cad_cdw_id)
spd911$general_offense_number <- as.integer(spd911$general_offense_number)
spd911$district_sector <- as.factor(spd911$district_sector)
spd911$longitude <- as.numeric(spd911$longitude)
spd911$latitude <- as.numeric(spd911$latitude)

# Parse the ISO-8601 style timestamps ("2015-02-01T00:20:00") directly; the
# literal "T" in the format string removes the need to rewrite the text first.
# The time zone is fixed so the result does not depend on the locale of the
# machine running the script. Values that do not match the format become NA.
# NOTE(review): assumes the feed reports Seattle local wall-clock time --
# confirm against the dataset's documentation.
spd911$at_scene_time <- as.POSIXct(
  spd911$at_scene_time,
  format = "%Y-%m-%dT%H:%M:%OS",
  tz = "America/Los_Angeles"
)

# Look at the structure of the cleaned data frame
str(spd911)
## 'data.frame': 1000 obs. of 13 variables:
## $ cad_event_number : chr "15000035997" "15000035929" "15000035487" "15000035390" ...
## $ cad_cdw_id : int 581875 581910 582157 582215 582277 582326 582470 582573 582638 582653 ...
## $ zone_beat : chr "K3" "Q1" "K3" "F2" ...
## $ initial_type_description: chr "AUTO THEFT - VEH THEFT OR THEFT AND RECOVERY" "NARCOTICS - VIOLATIONS (LOITER, USE, SELL, NARS)" "FOOT - ELUDING POLICE" "AUTO RECOVERY" ...
## $ district_sector : Factor w/ 18 levels "99","B","C","D",..: 9 14 9 6 10 6 15 18 14 15 ...
## $ initial_type_subgroup : chr "AUTO THEFTS" "NARCOTICS COMPLAINTS" "TRAFFIC RELATED CALLS" "AUTO THEFTS" ...
## $ hundred_block_location : chr "3 AV S / S WASHINGTON ST" "20XX BLOCK OF 15 AV W" "6 AV / YESLER WY" "86XX BLOCK OF 24 AV SW" ...
## $ general_offense_number : int 201535997 201535929 201535487 201535390 201535285 201535205 201534946 201534755 201534638 201534610 ...
## $ longitude : num -122 -122 -122 -122 -122 ...
## $ latitude : num 47.6 47.6 47.6 47.5 47.7 ...
## $ at_scene_time : POSIXct, format: "2015-02-01 00:20:00" "2015-01-31 23:12:00" "2015-01-31 15:14:00" ...
## $ initial_type_group : chr "AUTO RECOVERIES" "NARCOTICS COMPLAINTS" "TRAFFIC RELATED CALLS" "AUTO RECOVERIES" ...
## $ census_tract : chr "9200.2014" "5802.2003" "9200.1002" "11401.2005" ...
We want to plot a map of Seattle that shows the areas where the crimes occurred or, rather, where the crimes were reported from. If we analyze the data points on the map, we can see that although the calls are spread out all over Seattle, there is a high concentration of calls made from the center of the city, close to Capitol Hill and Downtown Seattle. Why the crime rate is so high in these areas is something that we can look into.
# Build the coordinate matrix for a SpatialPointsDataFrame.
coords <- cbind(
  longitude = as.numeric(as.character(spd911$longitude)),
  latitude = as.numeric(as.character(spd911$latitude))
)

# SpatialPointsDataFrame() rejects NA coordinates, so keep only the incidents
# that have both a longitude and a latitude.
has_location <- complete.cases(coords)
coords <- coords[has_location, , drop = FALSE]

# The coordinates live in the spatial object itself, so exclude them from the
# attribute table. Select by name rather than by position so the code keeps
# working if the column order changes.
attribute_columns <- setdiff(names(spd911), c("longitude", "latitude"))
crime_points <- SpatialPointsDataFrame(
  coords,
  spd911[has_location, attribute_columns, drop = FALSE]
)

# Plot just the points where the crimes occurred, without a map of the city
plot(crime_points, pch = ".", col = "darkred", cex = 4)

# Create a map of Seattle
map <- qmap("Seattle", zoom = 11, maptype = "hybrid")

# Overlay the incident locations on the map. aes() refers to the columns of
# `data` by name; rows with missing coordinates are dropped silently.
map +
  geom_point(
    data = spd911,
    aes(x = longitude, y = latitude),
    color = "red",
    size = 3,
    alpha = 0.4,
    na.rm = TRUE
  ) +
  xlab("Longitude") +
  ylab("Latitude")
We would also like to see what types of crimes occur in each district sector. To analyze the data further, we can look at their frequency in each sub-sector, that is, the different zone beats in which the crimes occur, which are encoded by color in the visualization.
# Create a list that contains all the district sectors in our data frame
district_sector_list <- list(as.character(unique(spd911$district_sector)))

# A missing sector cannot be matched with `==` and would yield an empty plot,
# so iterate over the non-missing sectors only.
sectors <- district_sector_list[[1]]
sectors <- sectors[!is.na(sectors)]

# Widen the bottom margin so the rotated subgroup labels fit, and remember the
# previous graphics settings so they can be restored after the loop.
old_par <- par(mar = c(10, 3, 3, 1))

# Draw one stacked bar chart per district sector: one bar per incident
# subgroup, split into coloured segments by zone beat.
for (sector in sectors) {
  # Restrict the data to the current district sector
  subset_sector <- subset(spd911, district_sector == sector)

  # Cross-tabulate zone beat (rows) against incident subgroup (columns).
  # Each row of this table becomes one coloured segment of every bar, so the
  # colours really do correspond to the zone beats shown in the legend.
  # Rows with a missing zone beat or subgroup are not counted.
  beat_by_subgroup <- table(
    subset_sector$zone_beat,
    subset_sector$initial_type_subgroup
  )
  if (length(beat_by_subgroup) == 0) {
    next
  }

  # One palette colour per zone beat, shared by the bars and the legend
  beat_colours <- seq_len(nrow(beat_by_subgroup))

  # Keep the original 0-20 axis as a minimum, but extend it when a subgroup
  # has more incidents so that no bar is clipped.
  tallest_bar <- max(colSums(beat_by_subgroup))

  barplot(
    beat_by_subgroup,
    col = beat_colours,
    ylim = c(0, max(20, tallest_bar)),
    las = 2,
    cex.names = 0.5,
    xlab = "",
    main = paste("District sector", sector)
  )
  legend(
    "topright",
    fill = beat_colours,
    legend = rownames(beat_by_subgroup),
    cex = 1,
    title = "Zone Beats"
  )
  mtext("Crimes Committed", side = 1, line = 9)
}

# Restore the graphics settings that were in effect before the loop
par(old_par)
One of the biases of our analysis is that we have assumed that the calls to 911 were made from the same location as, or a location close to, where the crime was committed.